#!/usr/bin/env bash
# Absolute directory that contains this script.
# The inner substitution is quoted so a path containing spaces survives, and
# `&&` keeps `pwd` from reporting the caller's directory when `cd` fails.
base_dir=$(cd -- "$(dirname -- "$0")" && pwd)
# Path of the configuration file that check_config sources
conf_file="${base_dir}/config/conf.sh"
# Directory where the log files are stored
log_dir="${base_dir}/logs"
# Regular (INFO) log file, named after today's date
log_file="${log_dir}/$(date +%Y-%m-%d)-INFO.log"
# Warning (WARN) log file, named after today's date
warn_log="${log_dir}/$(date +%Y-%m-%d)-WARN.log"
# Timestamp used as the prefix of every log line; it is evaluated once here,
# so all lines written by a single run carry the same time
time_style="$(date +%Y/%m/%d-%T%z)"
# df command line (word-split on use); adjust the options as needed.
# The column positions parsed in check_disk depend on the -T option.
df_cmd="df -Th -x devtmpfs -x tmpfs -x debugfs -x aufs -x overlay -x fuse.glusterfs"
# Age threshold for log archiving, in days
tar_time=7
# Name of the staging directory used when archiving: the date tar_time days ago
# (relies on GNU date's -d option)
tar_dir=$(date +%Y-%m-%d -d "${tar_time} days ago")
# Name of the archive produced from the staging directory
tar_name="${tar_dir}.tgz"

#######################################
# Load the configuration file and fill in defaults for unset/empty settings.
# Globals:
#   conf_file (read); disk_lists, cpu_limit, mem_limit, disk_limit,
#   disk_inode_limit, api_cert_file, cert_expires, kube_config, kube_cmd
#   (written)
# Outputs:
#   An error message on stderr when the configuration file is missing.
# Returns:
#   Exits the script with status 1 when the configuration file is missing.
#######################################
function check_config () {
  # A missing configuration file is an error: report it on stderr and exit
  # with a non-zero status so callers can detect the failure.
  if [[ ! -f "${conf_file}" ]]; then
    echo "${conf_file} is not found, please check it !" >&2
    exit 1
  fi

  # Pull in the variables defined by the configuration file. The path is
  # quoted so an installation directory containing spaces still works.
  # shellcheck source=/dev/null
  source "${conf_file}"

  # Mount points to inspect; defaults to /
  disk_lists=${disk_lists:-'/'}

  # CPU utilization threshold; defaults to 85%
  cpu_limit=${cpu_limit:-'85%'}

  # Memory utilization threshold; defaults to 85%
  mem_limit=${mem_limit:-'85%'}

  # Disk utilization threshold; defaults to 75%
  ## (original author's note: kubelet's default eviction kicks in when disk
  ## usage exceeds 85%)
  disk_limit=${disk_limit:-'75%'}

  # Inode utilization threshold; defaults to 85%
  disk_inode_limit=${disk_inode_limit:-'85%'}

  # apiserver certificate; defaults to /etc/kubernetes/pki/apiserver.crt
  api_cert_file=${api_cert_file:-'/etc/kubernetes/pki/apiserver.crt'}

  # Warn when the certificate has at most this many days left; defaults to 30
  cert_expires=${cert_expires:-'30'}

  # kubeconfig used by kubectl; defaults to /root/.kube/config
  kube_config=${kube_config:-'/root/.kube/config'}
  # Command prefix; consumers expand it unquoted so it word-splits into argv
  kube_cmd="kubectl --kubeconfig ${kube_config}"
}

#######################################
# Abort unless the script runs with root privileges.
# Outputs:
#   A red error message on stderr when the effective uid is not 0.
# Returns:
#   Exits the script with status 1 when the effective uid is not 0.
#######################################
function check_user () {
  # Compare the numeric effective uid rather than the account name, so any
  # uid-0 account is accepted regardless of what it is called.
  if [[ "$(id -u)" -ne 0 ]]; then
    printf '\e[1;31mPlease use the root to execute this shell !\e[0m\n' >&2
    exit 1
  fi
}

#######################################
# Print a blue "[INFO] [timestamp] message" progress line on the terminal.
# Globals:
#   time_style (read)
# Arguments:
#   The message words; they are joined with spaces.
# Outputs:
#   The colored line on stdout.
#######################################
function print_terminal () {
  # Timestamp and message are passed as %s arguments instead of being
  # embedded in the format string, so a literal % in either is printed as-is.
  printf '\e[1;34m[INFO] [%s] %s\e[0m\n' "${time_style}" "${*}"
}

#######################################
# Append a section title to the INFO log.
# A separator line holding a single space is written first when the log file
# already exists.
# Globals:
#   log_file (read)
# Arguments:
#   The title words; they are joined with spaces.
#######################################
function print_info_title () {
  # The redirect target is quoted: an unquoted path containing spaces makes
  # bash fail with "ambiguous redirect".
  if [[ -f "${log_file}" ]]; then
    printf '%s\n' " " >> "${log_file}"
  fi
  printf '%s\n' "===================== ${*} =====================" >> "${log_file}"
}

#######################################
# Append a section title to the WARN log.
# A separator line holding a single space is written first when the log file
# already exists.
# Globals:
#   warn_log (read)
# Arguments:
#   The title words; they are joined with spaces.
#######################################
function print_warn_title () {
  # The redirect target is quoted: an unquoted path containing spaces makes
  # bash fail with "ambiguous redirect".
  if [[ -f "${warn_log}" ]]; then
    printf '%s\n' " " >> "${warn_log}"
  fi
  printf '%s\n' "===================== ${*} =====================" >> "${warn_log}"
}

#######################################
# Write a section title to the WARN log unless the log already contains it.
# Globals:
#   warn_log (read)
# Arguments:
#   The title words; they are joined with spaces.
#######################################
function check_warn_title () {
  # -F matches the title literally (not as a regular expression), -q
  # suppresses output, and `--` protects a title starting with a dash.
  # A missing or unreadable log counts as "title not present".
  if ! grep -qF -- "${*}" "${warn_log}" 2> /dev/null; then
    print_warn_title "${*}"
  fi
}

#######################################
# Append one "[INFO] [timestamp] message" line to the INFO log.
# Globals:
#   time_style, log_file (read)
# Arguments:
#   The message words; they are joined with spaces.
#######################################
function print_info () {
  # Quoted redirect target so a log path containing spaces works.
  printf '%s\n' "[INFO] [${time_style}] ${*}" >> "${log_file}"
}

#######################################
# Append one "[WARN] [timestamp] message" line to the WARN log.
# Globals:
#   time_style, warn_log (read)
# Arguments:
#   The message words; they are joined with spaces.
#######################################
function print_warn () {
  # Quoted redirect target so a log path containing spaces works.
  printf '%s\n' "[WARN] [${time_style}] ${*}" >> "${warn_log}"
}

#######################################
# Prepare the log directory for a new run.
# Creates the directory when missing, renames today's existing INFO/WARN logs
# by appending "-<HH:MM:SS+zone>", then starts both logs with a title line
# holding the run timestamp.
# Globals:
#   log_dir, log_file, warn_log, time_style (read)
#######################################
function check_log_dir () {
  local stamp

  # Create the log directory when it does not exist yet
  [[ -d "${log_dir}" ]] || mkdir -p -- "${log_dir}"

  # Move aside logs left by an earlier run today; one suffix is shared by
  # both files so they can be matched up afterwards
  stamp=$(date +%T%z)
  if [[ -f "${log_file}" ]]; then
    mv -- "${log_file}" "${log_file}-${stamp}"
  fi
  if [[ -f "${warn_log}" ]]; then
    mv -- "${warn_log}" "${warn_log}-${stamp}"
  fi

  print_info_title "${time_style}"
  print_warn_title "${time_style}"
}

#######################################
# Archive old log files and delete old archives.
# Log files (*.log*) last modified more than tar_time days ago are moved into
# a staging directory and packed into ${log_dir}/${tar_name}; archives that
# are themselves older than tar_time days are then deleted.
# Globals:
#   log_dir, tar_time, tar_dir, tar_name (read)
# Outputs:
#   A progress line on the terminal via print_terminal.
#######################################
function check_tar () {
  local stage_dir="${log_dir}/${tar_dir}"
  local archive="${log_dir}/${tar_name}"
  local old_logs old_tars

  # Count log files older than tar_time days. The -name patterns are quoted so
  # the shell cannot expand them against the current directory, and the
  # staging directory is pruned so find never revisits files it has moved.
  old_logs=$(find "${log_dir}" -path "${stage_dir}" -prune -o \
    -type f -mtime "+${tar_time}" -name '*.log*' -print | wc -l)

  # Count archives older than tar_time days. Archives are written as *.tgz
  # (or *.tgz-<time> once moved aside); *.tar.gz is kept for archives that may
  # have been produced under the pattern the cleanup originally looked for.
  old_tars=$(find "${log_dir}" -type f -mtime "+${tar_time}" \
    \( -name '*.tgz*' -o -name '*.tar.gz' \) -print | wc -l)

  # Only act when at least one old log file exists
  if (( old_logs > 0 )); then
    mkdir -p -- "${stage_dir}"

    # Keep an archive written earlier today instead of overwriting it; the
    # archive lives directly in log_dir, next to the staging directory
    if [[ -f "${archive}" ]]; then
      mv -- "${archive}" "${archive}-$(date +%T%z)"
    fi

    find "${log_dir}" -path "${stage_dir}" -prune -o \
      -type f -mtime "+${tar_time}" -name '*.log*' \
      -exec mv -- {} "${stage_dir}/" \; &> /dev/null

    # tar -C avoids changing the working directory of the whole script; the
    # staging directory is removed only after the archive was written, and
    # :? refuses to expand an empty component into a dangerous rm target
    tar -C "${log_dir}" -czf "${archive}" "${tar_dir}" \
      && rm -rf -- "${log_dir:?}/${tar_dir:?}"
  fi

  # Only act when at least one old archive exists
  if (( old_tars > 0 )); then
    find "${log_dir}" -type f -mtime "+${tar_time}" \
      \( -name '*.tgz*' -o -name '*.tar.gz' \) -exec rm -f -- {} \;
  fi

  print_terminal "check logs done"
}

#######################################
# Record basic system facts and enforce swap/firewalld being off.
# Logs hostname, IP address, OS release, kernel, uptime in days and $LANG to
# the INFO log. When swap is active it runs `swapoff -a`; when firewalld is
# not inactive/disabled it stops/disables it. Each corrective action is
# recorded in the WARN log.
# Globals:
#   get_hostname (written); LANG (read)
# Outputs:
#   A progress line on the terminal via print_terminal.
#######################################
function check_system () {
  print_info_title 'check system'

  # Hostname. The variable was not declared local originally and is kept
  # global so that anything reading it after this call keeps working.
  get_hostname="$(cat /etc/hostname)"

  print_info "Hostname: ${get_hostname}"

  # IP address.
  # NOTE(review): the original comment said `hostname -i` could not be used in
  # a dual-NIC environment and that /etc/hosts should be parsed instead, yet
  # the code calls `hostname -i` - confirm which behavior is intended.
  local get_host_ip
  get_host_ip=$(hostname -i)

  print_info "Ipaddress: ${get_host_ip}"

  # Distribution
  local get_os_release
  get_os_release=$(awk -F '"' '/PRETTY_NAME/ {print $2}' /etc/os-release)

  print_info "Os-release: ${get_os_release} $(uname -o)"

  # Kernel
  local get_kernel
  get_kernel=$(uname -srmo)

  print_info "Kernel: ${get_kernel}"

  # Uptime in whole days. Referencing the variable by name inside $(( ))
  # makes an empty value count as 0 instead of raising a syntax error.
  local get_up_secs get_days
  get_up_secs=$(awk -F '.' '{print $1}' /proc/uptime)
  get_days=$(( get_up_secs / 60 / 60 / 24 ))

  print_info "Up Days: ${get_days} days"

  # Locale
  print_info "Os Language: ${LANG}"

  # Swap: count the lines of /proc/swaps that do not contain "size"
  # (case-insensitive), which drops the header row
  local swap_entries
  swap_entries=$(grep -iv size /proc/swaps | wc -l)

  if [[ "${swap_entries}" -eq 0 ]]; then
    print_info "Swap Status: off"
  else
    check_warn_title 'check system'
    swapoff -a
    # Recorded in the WARN log, like the firewalld corrections below; the
    # WARN section title is opened just above for this entry.
    print_warn "Swap Status: manual off"
  fi

  # firewalld: must be neither running nor enabled
  local firewalld_status firewalld_enable
  firewalld_status=$(systemctl is-active firewalld)
  firewalld_enable=$(systemctl is-enabled firewalld)

  if [[ "${firewalld_status}" == "inactive" ]]; then
    print_info "Firewalld Status: dead"
  else
    check_warn_title 'check system'
    systemctl stop firewalld
    print_warn "Firewalld Status: manual dead"
  fi

  if [[ "${firewalld_enable}" == "disabled" ]]; then
    print_info "Firewalld Enabled: disabled"
  else
    check_warn_title 'check system'
    systemctl disable firewalld
    print_warn "Firewalld Enabled: manual disabled"
  fi

  print_terminal "check system done"
}

#######################################
# Log CPU inventory, load averages and utilization; warn above cpu_limit.
# Utilization is computed from the aggregate "cpu" line of /proc/stat as
# (field2 + field4) * 100 / (field2 + field4 + field5); those counters are
# cumulative, so the figure is not a short-interval sample.
# Globals:
#   cpu_limit, warn_log (read)
# Outputs:
#   A progress line on the terminal via print_terminal.
#######################################
function check_cpu () {
  print_info_title "check cpu"

  # CPU inventory from /proc/cpuinfo
  local physical_cpus process_cpus core_cpus cpu_model
  physical_cpus=$(grep "^physical id" /proc/cpuinfo | sort -u | wc -l)
  process_cpus=$(grep -c "^processor" /proc/cpuinfo)
  core_cpus=$(awk '/^cpu cores/ {n = $NF} END {print n}' /proc/cpuinfo)
  cpu_model=$(awk -F ': ' '/^model name/ {print $2}' /proc/cpuinfo | sort -u)

  print_info "CPU Model: ${cpu_model}"
  print_info "Physical CPUS: ${physical_cpus}"
  print_info "Processor CPUS: ${process_cpus}"
  print_info "CPU Cores: ${core_cpus}"

  # Load averages: a single read keeps the three values consistent
  local one_min five_min fif_min _
  read -r one_min five_min fif_min _ < /proc/loadavg

  print_info "Load Average: ${one_min} , ${five_min} , ${fif_min}"

  # CPU utilization. A literal percent sign in an awk printf format must be
  # written %%; a lone trailing % is not a valid conversion.
  local cpu_util
  cpu_util=$(awk '/^cpu / {util = ($2 + $4) * 100 / ($2 + $4 + $5); printf "%.2f%%", util}' /proc/stat)

  print_info "CPU Utilization: ${cpu_util}"

  # Write WARN entries when the integer part reaches cpu_limit
  if [[ "${cpu_util%%.*}" -ge "${cpu_limit%\%}" ]]; then
    local top_cpu_use
    # 11 lines = the ps header row plus the ten busiest processes
    top_cpu_use=$(ps -eo user,pid,pcpu,args --sort=-pcpu | head -n 11)

    check_warn_title 'check cpu'
    print_warn "CPU utilization is ${cpu_util} , it's greater equal ${cpu_limit}, should be check !"
    print_warn "Top 10 CPU Use: "
    printf '%s\n' "${top_cpu_use}" >> "${warn_log}"
  fi

  print_terminal "check cpu done"
}

#######################################
# Log memory totals and utilization; warn above mem_limit.
# Values come from MemTotal and MemAvailable in /proc/meminfo; "used" is
# MemTotal - MemAvailable and sizes are reported in GiB.
# Globals:
#   mem_limit, warn_log (read)
# Outputs:
#   A progress line on the terminal via print_terminal.
#######################################
function check_mem () {
  print_info_title "check memory"

  # One awk pass emits "total used available utilization". A literal percent
  # sign in an awk printf format must be written %%.
  local get_mem_info
  get_mem_info=$(awk '
    /^MemTotal:/ {total = $2 / 1024 / 1024; next}
    /^MemAvailable:/ {
      available = $2 / 1024 / 1024
      use = total - available
      printf "%.2fGiB %.2fGiB %.2fGiB %.2f%%", total, use, available, (use / total) * 100
    }' /proc/meminfo)

  # Split the four whitespace-separated fields without spawning more awks
  local mem_total mem_used mem_available mem_util
  read -r mem_total mem_used mem_available mem_util <<< "${get_mem_info}"

  print_info "Mem Total: ${mem_total}"
  print_info "Mem Used: ${mem_used}"
  print_info "Mem Available: ${mem_available}"
  print_info "Mem Utilization: ${mem_util}"

  # Write WARN entries when the integer part reaches mem_limit
  if [[ "${mem_util%%.*}" -ge "${mem_limit%\%}" ]]; then
    local top_mem_use
    # Collected only when needed; 11 lines = the ps header row plus the ten
    # largest memory consumers
    top_mem_use=$(ps -eo user,pid,pmem,args --sort=-pmem | head -n 11)

    check_warn_title 'check memory'
    print_warn "Mem utilization is ${mem_util}, it's greater equal ${mem_limit}, should be check !"
    print_warn "Top 10 Mem Use: "
    printf '%s\n' "${top_mem_use}" >> "${warn_log}"
  fi

  print_terminal "check memory done"
}

#######################################
# Private helper: report one df table for every configured mount point.
# Logs the df row of each mount to the INFO log and writes a WARN entry when
# column 6 of the row (the use% column when df runs with -T) reaches the limit.
# Globals:
#   disk_lists, df_cmd (read)
# Arguments:
#   $1 - threshold such as "75%"
#   $2 - wording for the warning ("utilization" / "inode utilization")
#   $3.. - extra options appended to the df command (e.g. -i)
#######################################
function _check_disk_column () {
  local limit="$1" what="$2"
  shift 2

  local table mount row util
  local -a mounts=()

  # Split the whitespace-separated mount list (newlines included) into an
  # array; read returns non-zero at end of input, which is expected here
  read -r -d '' -a mounts <<< "${disk_lists}" || true

  # df_cmd is deliberately unquoted: it holds a command plus its options.
  # Run it once and look up every mount in the captured table.
  table=$(${df_cmd} "$@")

  for mount in "${mounts[@]}"; do
    # Exact match on the mount-point column, so "/data" does not also select
    # "/mnt/data"; only the first matching row is used
    row=$(awk -v m="${mount}" '$NF == m {print; exit}' <<< "${table}")

    # A mount that is not present is skipped without abandoning the
    # remaining entries of the list
    [[ -n "${row}" ]] || continue

    print_info "${row}"

    # df prints the use% in column 6 because of -T; update this position if
    # the df options change
    util=$(awk '{print $6}' <<< "${row}")

    # Some filesystems report "-" instead of a percentage; nothing to compare
    [[ "${util%\%}" =~ ^[0-9]+$ ]] || continue

    if [[ "${util%\%}" -ge "${limit%\%}" ]]; then
      check_warn_title 'check disk'
      print_warn "Disk ${mount} ${what} is ${util}, it's greater equal ${limit}, should be check !"
    fi
  done
}

#######################################
# Log disk space and inode usage of the configured mount points and warn
# when usage reaches disk_limit / disk_inode_limit.
# Globals:
#   disk_lists, df_cmd, disk_limit, disk_inode_limit (read)
# Outputs:
#   A progress line on the terminal via print_terminal.
#######################################
function check_disk () {
  print_info_title "check disk"

  # Disk space utilization
  print_info "Disk Info: "
  _check_disk_column "${disk_limit}" "utilization"

  # Inode utilization; the warning text names inodes explicitly so it cannot
  # be mistaken for a disk space warning
  print_info '---'
  print_info "Disk Inode Info: "
  _check_disk_column "${disk_inode_limit}" "inode utilization" -i

  print_terminal "check disk done"
}

#######################################
# Check the apiserver certificate lifetime, node readiness and node metrics.
# The certificate check runs when api_cert_file exists. The node checks run
# when kube_config exists; otherwise an INFO line is logged instead.
# Globals:
#   api_cert_file, cert_expires, kube_config, kube_cmd, cpu_limit, mem_limit
#   (read)
#######################################
function check_kubernetes () {
  print_info_title "check kubernetes"

  if [[ -f "${api_cert_file}" ]]; then
    # Expiry date of the apiserver certificate and whole days left until then
    local cert_info cert_time_stamp cert_not_after
    cert_info=$(openssl x509 -in "${api_cert_file}" -noout -text \
      | awk -F ': ' '/Not After/ {print $2}')
    cert_time_stamp=$(date -d "${cert_info}" +%s)
    cert_not_after=$(( (cert_time_stamp - $(date +%s)) / 86400 ))

    print_info "Apiserver Cert Not After: ${cert_info}"

    # Warn when at most cert_expires days remain. The message reports the
    # configured threshold, not the number of days actually left.
    if [[ "${cert_not_after}" -le "${cert_expires}" ]]; then
      check_warn_title 'check kubernetes'
      print_warn "The apiserver cert will expire in ${cert_expires} days, please renewal !"
    fi
  fi

  if [[ -f "${kube_config}" ]]; then
    local node_table top_table top_row node_name node_status _
    local node_cpu_usage node_mem_usage
    local -a node_names=()

    # kube_cmd is deliberately unquoted: it holds a command plus its options.
    # A single listing supplies both the name and the STATUS column.
    node_table=$(${kube_cmd} get node --no-headers=true)

    # Every node must report exactly "Ready". Reading name and status from
    # the same row avoids matching one node's name against another's row
    # (e.g. "node1" against "node10").
    while read -r node_name node_status _; do
      [[ -n "${node_name}" ]] || continue
      node_names+=("${node_name}")

      if [[ "${node_status}" == "Ready" ]]; then
        print_info "Node Status: ${node_name} is Ready"
      else
        check_warn_title 'check kubernetes'
        print_warn "Node: ${node_name} is NotReady , please check !"
      fi
    done <<< "${node_table}"

    # Resource usage per node; skipped when `top node` fails
    if top_table=$(${kube_cmd} top node 2> /dev/null); then
      for node_name in "${node_names[@]}"; do
        # Exact match on the NAME column; this also skips the header row
        top_row=$(awk -v n="${node_name}" '$1 == n' <<< "${top_table}")
        # Columns 3 and 5 are read as the CPU% and MEMORY% values
        node_cpu_usage=$(awk '{print $3}' <<< "${top_row}")
        node_mem_usage=$(awk '{print $5}' <<< "${top_row}")

        print_info "Top Nodes: ${top_row}"

        # Non-numeric values are skipped instead of breaking the comparison
        if [[ "${node_cpu_usage%\%}" =~ ^[0-9]+$ ]] \
          && [[ "${node_cpu_usage%\%}" -ge "${cpu_limit%\%}" ]]; then
          check_warn_title 'check kubernetes'
          print_warn "${node_name} top node check: cpu usage is ${node_cpu_usage}, it's greater equal ${cpu_limit}, should be check !"
        fi

        if [[ "${node_mem_usage%\%}" =~ ^[0-9]+$ ]] \
          && [[ "${node_mem_usage%\%}" -ge "${mem_limit%\%}" ]]; then
          check_warn_title 'check kubernetes'
          # This entry is about memory, so the text says "mem usage"
          print_warn "${node_name} top node check: mem usage is ${node_mem_usage}, it's greater equal ${mem_limit}, should be check !"
        fi
      done
    fi
  else
    print_info "This node's role is the work for kubernetes cluster"
  fi
}

#######################################
# Entry point: run every inspection step once, in a fixed order.
# Configuration is loaded first, then the root check, log preparation and
# log archiving, followed by the system, CPU, memory, disk and Kubernetes
# checks.
#######################################
function main () {
  local step

  for step in \
    check_config \
    check_user \
    check_log_dir \
    check_tar \
    check_system \
    check_cpu \
    check_mem \
    check_disk \
    check_kubernetes; do
    "${step}"
  done
}

main

